Read Dataset

# Load the NYC Airbnb 2019 listings (CSV expected in the working directory)
# and preview the first six rows.
ds = read.csv("AB_NYC_2019.csv")
head(ds)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129

Data cleaning

Check for NA and NULL values

#Check for NA
# Count missing values per column. vapply over the data.frame's columns
# avoids apply()'s coercion of the whole mixed-type data.frame to a
# character matrix, and guarantees an integer result per column.
vapply(ds, function(x) sum(is.na(x)), integer(1))
##                             id                           name 
##                              0                              0 
##                        host_id                      host_name 
##                              0                              0 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##                    last_review              reviews_per_month 
##                              0                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0
# NOTES
# Remove NA, empty
#
#
#
#

Standardisation and selection of the variables (note: the code applies z-score standardisation via scale(); the min-max normalize() helper below is defined but not used)

# Min-max scale a numeric vector onto [0, 1].
#
# Args:
#   x:     numeric vector.
#   na.rm: drop NA values when computing the range? Defaults to FALSE,
#          which preserves the original behaviour (any NA in x makes the
#          whole result NA).
#
# Returns: a numeric vector the same length as x. Note: when all values of
#   x are identical the range is zero and the result is NaN (0/0), matching
#   the original implementation.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}


# Prepare the raw Airbnb listings for modelling:
#   * drop identifier / free-text / review-related columns,
#   * z-score standardise the numeric predictors (via scale()),
#   * encode the two categorical variables as integer-labelled factors.
#
# Args:
#   ds: data.frame as returned by read.csv("AB_NYC_2019.csv").
#
# Returns: a data.frame with columns neighbourhood_group, latitude,
#   longitude, room_type and price.
clean_data <- function(ds) {
  # Base-R column drop (the original used dplyr::select); setdiff keeps the
  # remaining columns in their original order.
  drop_cols <- c("host_id", "id", "host_name", "name", "minimum_nights",
                 "number_of_reviews", "neighbourhood", "last_review",
                 "availability_365", "reviews_per_month",
                 "calculated_host_listings_count")
  ds <- ds[, setdiff(names(ds), drop_cols), drop = FALSE]

  numerical <- c("price", "longitude", "latitude")

  # scale() centres and scales each column (z-score standardisation).
  ds[numerical] <- scale(ds[numerical])

  # 'levels' spelled out in full: the original wrote 'level=', which only
  # worked through R's partial argument matching.
  ds$neighbourhood_group <- factor(ds$neighbourhood_group,
                                   levels = c("Brooklyn", "Manhattan",
                                              "Queens", "Staten Island",
                                              "Bronx"),
                                   labels = c(1, 2, 3, 4, 5))
  ds$room_type <- factor(ds$room_type,
                         levels = c("Private room", "Entire home/apt",
                                    "Shared room"),
                         labels = c(1, 2, 3))

  ds
}
# Commented-out experiment: overlaying the data on a NYC map image.
#ggdraw() +
#  draw_image("New_York_City_.png") +
#  draw_plot(myplot)

# Apply the cleaning/encoding pipeline and inspect the result.
dataset = clean_data(ds)

head(dataset)
##   neighbourhood_group   latitude  longitude room_type       price
## 1                   1 -1.4938339 -0.4376476         1 -0.01549291
## 2                   2  0.4524314 -0.6846321         2  0.30097047
## 3                   2  1.4683845  0.2224944         1 -0.01132892
## 4                   1 -0.8033893 -0.1644481         2 -0.26533242
## 5                   2  1.2756468  0.1772139         2 -0.30280835
## 6                   2  0.3433173 -0.4946274         2  0.19687067

Split data into train and test sets

library(caTools)
## Warning: package 'caTools' was built under R version 3.6.3
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
data_clean <- dataset
# Stratified 75/25 split on the outcome variable.
# BUG FIX: the original called sample.split(data_clean, ...) with the whole
# data.frame; caTools then builds a mask of length ncol (5) that gets
# recycled over the rows, which is why the printed dims below show a 60/40
# split instead of the requested 75/25. sample.split expects the vector of
# labels, not the data.
sample <- sample.split(data_clean$neighbourhood_group, SplitRatio = .75)
train <- subset(data_clean, sample == TRUE)
test  <- subset(data_clean, sample == FALSE)

# Report row/column counts of the full, train and test sets.
print("Initial data shape")
## [1] "Initial data shape"
print(dim(data_clean))
## [1] 48895     5
print("Train shape")
## [1] "Train shape"
print(dim(train))
## [1] 29337     5
print("Test shape")
## [1] "Test shape"
print(dim(test))
## [1] 19558     5

RANDOM FOREST

https://www.guru99.com/r-random-forest-tutorial.html

GOOD VIDEO FOR PARAMETER OPTIMISATION https://www.youtube.com/watch?v=6EXPYzbfLCE

TUNING https://uc-r.github.io/random_forests

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Baseline classification forest: predict neighbourhood_group from the
# remaining predictors with randomForest defaults (the printout below shows
# 500 trees, mtry = 2).
rf <- randomForest(
  neighbourhood_group ~ . ,
  data=train,
 # importance = TRUE
)
rf
## 
## Call:
##  randomForest(formula = neighbourhood_group ~ ., data = train,      ) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.13%
## Confusion matrix:
##       1     2    3   4   5  class.error
## 1 12083     1   12   0   0 0.0010747354
## 2     0 12947    0   0   2 0.0001544521
## 3    16     2 3387   0   0 0.0052863436
## 4     0     0    0 239   0 0.0000000000
## 5     0     4    0   0 644 0.0061728395
#https://www.r-bloggers.com/how-to-implement-random-forests-in-r/
# Variable importance of the baseline forest.
varImpPlot(rf)

# Predict the held-out test set with the baseline model.
pred = predict(rf, test)
head(pred)
##  2  4  7  9 12 14 
##  2  1  1  2  2  2 
## Levels: 1 2 3 4 5
# Distribution of predicted classes.
plot(pred)

# Raw contingency table: predicted vs. actual labels.
table(pred,test$neighbourhood_group)
##     
## pred    1    2    3    4    5
##    1 8005    2    7    0    0
##    2    1 8710    0    0    4
##    3    2    0 2253    0    0
##    4    0    0    0  134    0
##    5    0    0    1    0  439
# Full caret confusion matrix with per-class statistics.
confusionMatrix(pred, test$neighbourhood_group)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5
##          1 8005    2    7    0    0
##          2    1 8710    0    0    4
##          3    2    0 2253    0    0
##          4    0    0    0  134    0
##          5    0    0    1    0  439
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9991          
##                  95% CI : (0.9986, 0.9995)
##     No Information Rate : 0.4454          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9986          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9996   0.9998   0.9965 1.000000  0.99097
## Specificity            0.9992   0.9995   0.9999 1.000000  0.99995
## Pos Pred Value         0.9989   0.9994   0.9991 1.000000  0.99773
## Neg Pred Value         0.9997   0.9998   0.9995 1.000000  0.99979
## Prevalence             0.4094   0.4454   0.1156 0.006851  0.02265
## Detection Rate         0.4093   0.4453   0.1152 0.006851  0.02245
## Detection Prevalence   0.4098   0.4456   0.1153 0.006851  0.02250
## Balanced Accuracy      0.9994   0.9997   0.9982 1.000000  0.99546
# Regression forest: predict (standardised) price from the other variables.
# The trailing empty argument after data=train has been removed.
rf2 <- randomForest(
  price ~ . ,
  data = train
  # importance = TRUE
)
rf2
## 
## Call:
##  randomForest(formula = price ~ ., data = train, ) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.8848527
##                     % Var explained: 9.53
varImpPlot(rf2)

# Real vs. predicted price on the test set; abline(0, 1) marks perfect
# predictions.
par(mfrow=c(1,2))
plot(test$price,predict(rf2,test),col='red',main='Real vs predicted RF',pch=18,cex=0.7)
abline(0,1,lwd=2)
# BUG FIX: the legend said 'NN' but this panel shows the random forest.
legend('bottomright',legend='RF',pch=18,col='red', bty='n')

library(randomForest)
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
# 10-fold cross-validation with grid search over tuning parameters.
# allowParallel lets caret use a registered parallel backend when present.
trControl <- trainControl(method = "cv",
                          number = 10,
                          search = "grid",
                          allowParallel = TRUE
                        )
trControl$method
## [1] "cv"
trControl$number
## [1] 10

Possible multi-core tuning

registerDoFuture() plan(multiprocess, workers = availableCores() - 1)

library(doParallel) cl <- makeCluster(detectCores()) registerDoParallel(cl) #### machine learning code goes in here stopCluster(cl)

Calculate computational Time:

start.time <- Sys.time() …Relevant code… end.time <- Sys.time() time.taken <- end.time - start.time time.taken

library(doParallel)
## Warning: package 'doParallel' was built under R version 3.6.3
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.6.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 3.6.3
## Loading required package: parallel
library(doFuture)
## Warning: package 'doFuture' was built under R version 3.6.3
## Loading required package: globals
## Loading required package: future
## 
## Attaching package: 'future'
## The following object is masked from 'package:caret':
## 
##     cluster
start.time <- Sys.time()

#ptm <- proc.time()

# Cross-validated random forest via caret, timed with Sys.time().
# NOTE(review): num.threads is not a caret::train or randomForest argument;
# it is forwarded through '...' — presumably intended for a threaded
# backend. Verify it actually has an effect here (the timing difference vs.
# the run below is small).
rf_default <- caret::train(
  neighbourhood_group ~ . ,
  data=train,
  method = "rf",
  metric="Accuracy",
  trControl = trControl,
  num.threads = availableCores() # <- This one
  
)

end.time <- Sys.time()
time.taken <- end.time - start.time

print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 5.188274 mins
print("")
## [1] ""
print("-- RANDOM FOREST -- ")
## [1] "-- RANDOM FOREST -- "
#proc.time() - ptm
print(rf_default)
## Random Forest 
## 
## 29337 samples
##     4 predictor
##     5 classes: '1', '2', '3', '4', '5' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26402, 26402, 26405, 26405, 26403, 26403, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9977504  0.9963760
##   3     0.9986025  0.9977494
##   5     0.9983979  0.9974200
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
#ptm <- proc.time()
start.time <- Sys.time()

# Same 10-fold CV random-forest fit as above, without the num.threads hint,
# to compare wall-clock time. The trailing empty argument after
# trControl = trControl (a dangling comma before the closing parenthesis)
# has been removed.
rf_default <- caret::train(
  neighbourhood_group ~ . ,
  data = train,
  method = "rf",
  metric = "Accuracy",
  trControl = trControl
)
#proc.time() - ptm
end.time <- Sys.time()
time.taken <- end.time - start.time

# Timing and CV summary for the un-hinted run.
print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 5.623895 mins
print("")
## [1] ""
print(rf_default)
## Random Forest 
## 
## 29337 samples
##     4 predictor
##     5 classes: '1', '2', '3', '4', '5' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26402, 26403, 26403, 26404, 26402, 26403, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9979207  0.9966505
##   3     0.9988070  0.9980790
##   5     0.9986706  0.9978591
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
start.time <- Sys.time()

# Grid-search mtry (variables tried per split) from 1 to 10 with 10-fold CV.
# NOTE(review): the "invalid mtry: reset to within valid range" warnings
# below occur because the upper end of this grid exceeds the number of
# predictors available after factor encoding, so those candidates are
# silently clamped — the high-mtry rows in the results are redundant.
# Consider capping the grid at the predictor count.
set.seed(1234)
tuneGrid <- expand.grid(.mtry = c(1: 10))
rf_mtry <- caret::train(
  neighbourhood_group~.,
  data = train,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = tuneGrid,
  trControl = trControl,
  importance = TRUE,
  nodesize = 14,
  ntree = 300,
  num.threads = availableCores()-1
)
## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range

## Warning in randomForest.default(x, y, mtry = param$mtry, ...): invalid mtry:
## reset to within valid range
end.time <- Sys.time()
time.taken <- end.time - start.time

print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 10.68638 mins
print("")
## [1] ""
# Cross-validation results per mtry candidate.
print(rf_mtry)
## Random Forest 
## 
## 29337 samples
##     4 predictor
##     5 classes: '1', '2', '3', '4', '5' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26404, 26403, 26402, 26403, 26403, 26404, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    1    0.9192489  0.8663930
##    2    0.9974435  0.9958825
##    3    0.9985684  0.9976949
##    4    0.9985002  0.9975854
##    5    0.9980911  0.9969263
##    6    0.9980911  0.9969265
##    7    0.9980911  0.9969263
##    8    0.9980571  0.9968714
##    9    0.9980230  0.9968167
##   10    0.9981593  0.9970361
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Extract the winning mtry from caret's tuning results for later stages.
best_mtry <- rf_mtry$bestTune$mtry 
cat("Best mtry value:",best_mtry)
## Best mtry value: 3
cat("\nMax accuracy mtry:", max(rf_mtry$results$Accuracy))
## 
## Max accuracy mtry: 0.9985684
start.time <- Sys.time()

# Tune maxnodes (maximum number of terminal nodes) at the best mtry,
# storing every fitted model so resampling distributions can be compared.
# The same seed per iteration keeps CV folds identical across candidates.
# The trailing empty argument after ntree = 300 has been removed.
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5: 15)) {
  set.seed(1234)
  rf_maxnode <- train(neighbourhood_group~.,
                      data = train,
                      method = "rf",
                      metric = "Accuracy",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 14,
                      maxnodes = maxnodes,
                      ntree = 300)
  current_iteration <- toString(maxnodes)
  store_maxnode[[current_iteration]] <- rf_maxnode
}

end.time <- Sys.time()
time.taken <- end.time - start.time

print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 7.612453 mins
print("")
## [1] ""
# Compare resampling distributions across maxnodes = 5..15.
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
## 
## Call:
## summary.resamples(object = results_mtry)
## 
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 
## Number of resamples: 10 
## 
## Accuracy 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 5  0.8649847 0.8693425 0.8774922 0.8759245 0.8823079 0.8837368    0
## 6  0.8653256 0.8696834 0.8786850 0.8765380 0.8822626 0.8847596    0
## 7  0.8660075 0.8702284 0.8797072 0.8769128 0.8827139 0.8844187    0
## 8  0.8677122 0.8721134 0.8810301 0.8827754 0.8864081 0.9048756    0
## 9  0.8700989 0.8944766 0.9006476 0.8986593 0.9052407 0.9155026    0
## 10 0.9072938 0.9092459 0.9139543 0.9167942 0.9213263 0.9321976    0
## 11 0.9004432 0.9099121 0.9205728 0.9185315 0.9255470 0.9328333    0
## 12 0.9093079 0.9178385 0.9246888 0.9233723 0.9309640 0.9321976    0
## 13 0.9233129 0.9307024 0.9340605 0.9338720 0.9383150 0.9444255    0
## 14 0.9287419 0.9352253 0.9398434 0.9398368 0.9434270 0.9532901    0
## 15 0.9386294 0.9430811 0.9475109 0.9491422 0.9545801 0.9628367    0
## 
## Kappa 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 5  0.7793972 0.7864172 0.7996329 0.7971574 0.8076095 0.8098218    0
## 6  0.7799906 0.7868578 0.8016329 0.7980972 0.8071669 0.8117301    0
## 7  0.7811803 0.7881542 0.8033289 0.7989014 0.8083707 0.8113753    0
## 8  0.7838301 0.7910356 0.8055847 0.8085243 0.8146300 0.8450767    0
## 9  0.7876248 0.8273720 0.8376029 0.8343309 0.8453941 0.8618689    0
## 10 0.8483833 0.8515134 0.8592176 0.8639096 0.8713883 0.8890951    0
## 11 0.8369987 0.8526091 0.8701376 0.8667766 0.8782680 0.8904073    0
## 12 0.8516275 0.8656820 0.8769131 0.8747760 0.8872923 0.8891170    0
## 13 0.8746033 0.8868985 0.8923119 0.8920415 0.8994440 0.9094351    0
## 14 0.8836125 0.8943161 0.9016622 0.9017820 0.9077138 0.9238400    0
## 15 0.8998846 0.9071320 0.9141669 0.9169273 0.9257702 0.9392863    0
start.time <- Sys.time()

# Extend the maxnodes search upward. The loop starts at 16 rather than 15:
# maxnodes = 15 is already in store_maxnode from the previous loop with the
# same seed and parameters, so re-fitting it would only duplicate ~1 model's
# worth of compute and overwrite an identical entry.
for (maxnodes in 16:30) {
  set.seed(1234)
  rf_maxnode <- train(neighbourhood_group~.,
                      data = train,
                      method = "rf",
                      metric = "Accuracy",
                      tuneGrid = tuneGrid,
                      trControl = trControl,
                      importance = TRUE,
                      nodesize = 14,
                      maxnodes = maxnodes,
                      ntree = 300)
  current_iteration <- toString(maxnodes)
  store_maxnode[[current_iteration]] <- rf_maxnode
}

end.time <- Sys.time()
time.taken <- end.time - start.time

print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 14.05081 mins
print("")
## [1] ""
# Re-summarise with the extended maxnodes range (5..30) included.
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
## 
## Call:
## summary.resamples(object = results_mtry)
## 
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 
## Number of resamples: 10 
## 
## Accuracy 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 5  0.8649847 0.8693425 0.8774922 0.8759245 0.8823079 0.8837368    0
## 6  0.8653256 0.8696834 0.8786850 0.8765380 0.8822626 0.8847596    0
## 7  0.8660075 0.8702284 0.8797072 0.8769128 0.8827139 0.8844187    0
## 8  0.8677122 0.8721134 0.8810301 0.8827754 0.8864081 0.9048756    0
## 9  0.8700989 0.8944766 0.9006476 0.8986593 0.9052407 0.9155026    0
## 10 0.9072938 0.9092459 0.9139543 0.9167942 0.9213263 0.9321976    0
## 11 0.9004432 0.9099121 0.9205728 0.9185315 0.9255470 0.9328333    0
## 12 0.9093079 0.9178385 0.9246888 0.9233723 0.9309640 0.9321976    0
## 13 0.9233129 0.9307024 0.9340605 0.9338720 0.9383150 0.9444255    0
## 14 0.9287419 0.9352253 0.9398434 0.9398368 0.9434270 0.9532901    0
## 15 0.9386294 0.9430811 0.9475109 0.9491422 0.9545801 0.9628367    0
## 16 0.9458078 0.9495397 0.9529734 0.9542213 0.9597878 0.9635186    0
## 17 0.9515854 0.9556843 0.9587594 0.9593693 0.9642095 0.9682919    0
## 18 0.9519264 0.9562883 0.9613024 0.9607653 0.9656668 0.9696763    0
## 19 0.9498807 0.9575665 0.9631776 0.9628796 0.9673624 0.9741056    0
## 20 0.9567144 0.9601304 0.9708590 0.9679253 0.9734061 0.9778459    0
## 21 0.9706785 0.9719570 0.9759713 0.9751506 0.9770810 0.9809069    0
## 22 0.9723926 0.9750297 0.9768155 0.9770937 0.9791259 0.9819359    0
## 23 0.9734151 0.9760484 0.9769938 0.9776731 0.9804040 0.9819359    0
## 24 0.9737470 0.9762189 0.9780164 0.9780140 0.9799779 0.9819359    0
## 25 0.9751193 0.9779236 0.9786982 0.9795819 0.9827000 0.9839809    0
## 26 0.9737470 0.9786091 0.9798841 0.9796161 0.9822813 0.9843164    0
## 27 0.9758010 0.9796284 0.9812546 0.9811841 0.9837267 0.9849983    0
## 28 0.9771643 0.9788722 0.9809069 0.9810819 0.9834697 0.9853392    0
## 29 0.9774974 0.9800648 0.9809069 0.9817295 0.9841527 0.9863621    0
## 30 0.9781868 0.9799693 0.9819363 0.9821385 0.9846601 0.9856851    0
## 
## Kappa 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 5  0.7793972 0.7864172 0.7996329 0.7971574 0.8076095 0.8098218    0
## 6  0.7799906 0.7868578 0.8016329 0.7980972 0.8071669 0.8117301    0
## 7  0.7811803 0.7881542 0.8033289 0.7989014 0.8083707 0.8113753    0
## 8  0.7838301 0.7910356 0.8055847 0.8085243 0.8146300 0.8450767    0
## 9  0.7876248 0.8273720 0.8376029 0.8343309 0.8453941 0.8618689    0
## 10 0.8483833 0.8515134 0.8592176 0.8639096 0.8713883 0.8890951    0
## 11 0.8369987 0.8526091 0.8701376 0.8667766 0.8782680 0.8904073    0
## 12 0.8516275 0.8656820 0.8769131 0.8747760 0.8872923 0.8891170    0
## 13 0.8746033 0.8868985 0.8923119 0.8920415 0.8994440 0.9094351    0
## 14 0.8836125 0.8943161 0.9016622 0.9017820 0.9077138 0.9238400    0
## 15 0.8998846 0.9071320 0.9141669 0.9169273 0.9257702 0.9392863    0
## 16 0.9117547 0.9178268 0.9235396 0.9255196 0.9345687 0.9407029    0
## 17 0.9212544 0.9279546 0.9330828 0.9339460 0.9417676 0.9485093    0
## 18 0.9218246 0.9290189 0.9370455 0.9362336 0.9441766 0.9506521    0
## 19 0.9185262 0.9311699 0.9401894 0.9397016 0.9469434 0.9579223    0
## 20 0.9296463 0.9351900 0.9526127 0.9479116 0.9568140 0.9640983    0
## 21 0.9523150 0.9544444 0.9609792 0.9596524 0.9628454 0.9689943    0
## 22 0.9551685 0.9594814 0.9623503 0.9628126 0.9661463 0.9707219    0
## 23 0.9567988 0.9610981 0.9626597 0.9637459 0.9682097 0.9706991    0
## 24 0.9573366 0.9613930 0.9642862 0.9642935 0.9675180 0.9707000    0
## 25 0.9595853 0.9641321 0.9653823 0.9668399 0.9719168 0.9740505    0
## 26 0.9573428 0.9652395 0.9673369 0.9669018 0.9712556 0.9745665    0
## 27 0.9607108 0.9668990 0.9695829 0.9694608 0.9736205 0.9756806    0
## 28 0.9628847 0.9656645 0.9690203 0.9692872 0.9732049 0.9762372    0
## 29 0.9634154 0.9676231 0.9690040 0.9703438 0.9743209 0.9779104    0
## 30 0.9645500 0.9674709 0.9706882 0.9710099 0.9751228 0.9768119    0

Result: best maxnodes = 30 (accuracy still improving at the top of the searched range)

Step 4) Search the best ntrees

Now that you have the best value of mtry and maxnode, you can tune the number of trees. The method is exactly the same as maxnode.

start.time <- Sys.time()

# Tune ntree at the chosen mtry with maxnodes fixed at 24, storing each fit
# for resampling comparison. A fixed seed per iteration keeps CV folds
# identical across ntree candidates.
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
  set.seed(5678)
  rf_maxtrees <- train(neighbourhood_group~.,
                       data = train,
                       method = "rf",
                       metric = "Accuracy",
                       tuneGrid = tuneGrid,
                       trControl = trControl,
                       importance = TRUE,
                       nodesize = 14,
                       maxnodes = 24,
                       ntree = ntree)
  key <- toString(ntree)
  store_maxtrees[[key]] <- rf_maxtrees
}

end.time <- Sys.time()
time.taken <- end.time - start.time

print("-- Time: -- ")
## [1] "-- Time: -- "
time.taken
## Time difference of 20.17586 mins
print("")                                                          
## [1] ""
results_tree <- resamples(store_maxtrees)
summary(results_tree)
## 
## Call:
## summary.resamples(object = results_tree)
## 
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 250  0.9754518 0.9769126 0.9783535 0.9785596 0.9797954 0.9822768    0
## 300  0.9761337 0.9773386 0.9781831 0.9788663 0.9799660 0.9826176    0
## 350  0.9754518 0.9775827 0.9778497 0.9788663 0.9802284 0.9826176    0
## 400  0.9754518 0.9769128 0.9781905 0.9787640 0.9801364 0.9826176    0
## 450  0.9757927 0.9775942 0.9780201 0.9789003 0.9802284 0.9826176    0
## 500  0.9757927 0.9775128 0.9778422 0.9787299 0.9798022 0.9826176    0
## 550  0.9754518 0.9771604 0.9776794 0.9786617 0.9798022 0.9829584    0
## 600  0.9757927 0.9772456 0.9778498 0.9786958 0.9797954 0.9826176    0
## 800  0.9757927 0.9775942 0.9778497 0.9787980 0.9798022 0.9826176    0
## 1000 0.9757927 0.9775013 0.9776794 0.9787299 0.9797954 0.9826176    0
## 2000 0.9754518 0.9772534 0.9780201 0.9786958 0.9798022 0.9822768    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 250  0.9600825 0.9624691 0.9648381 0.9651800 0.9672193 0.9712526    0
## 300  0.9611893 0.9631733 0.9645575 0.9656789 0.9674895 0.9718120    0
## 350  0.9600832 0.9635485 0.9640089 0.9656738 0.9679314 0.9718105    0
## 400  0.9600667 0.9624696 0.9645621 0.9655031 0.9677586 0.9718105    0
## 450  0.9606278 0.9635856 0.9642846 0.9657281 0.9679314 0.9718100    0
## 500  0.9606278 0.9634492 0.9640085 0.9654539 0.9672477 0.9718100    0
## 550  0.9600667 0.9629186 0.9637187 0.9653449 0.9672477 0.9723673    0
## 600  0.9606278 0.9630503 0.9640081 0.9654002 0.9672160 0.9718100    0
## 800  0.9606278 0.9635816 0.9640128 0.9655633 0.9672457 0.9718105    0
## 1000 0.9606278 0.9634492 0.9637324 0.9654556 0.9672160 0.9718105    0
## 2000 0.9600825 0.9630424 0.9642846 0.9654014 0.9672509 0.9712526    0

Step 5) Evaluate the model

# Final model with the tuned hyper-parameters (best mtry via tuneGrid,
# ntree = 1000, maxnodes = 30), then compare against the untuned baseline
# rf on the held-out test set.
fit_rf <- train(neighbourhood_group~.,
                train,
                method = "rf",
                metric = "Accuracy",
                tuneGrid = tuneGrid,
                trControl = trControl,
                importance = TRUE,
                nodesize = 14,
                ntree = 1000,
                maxnodes = 30)




# Test-set predictions from the tuned and the default (baseline) forest.
prediction <-predict(fit_rf, test)

prediction_default <-predict(rf, test)

conf = confusionMatrix(prediction, test$neighbourhood_group)

conf_default = confusionMatrix(prediction_default, test$neighbourhood_group)

# Tuned model's confusion matrix (the baseline's follows below).
conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5
##          1 7999    5  124    0    0
##          2    5 8697  171    0   22
##          3    4    0 1966    0    0
##          4    0    0    0  134    0
##          5    0   10    0    0  421
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9826          
##                  95% CI : (0.9806, 0.9844)
##     No Information Rate : 0.4454          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9717          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9989   0.9983   0.8695 1.000000  0.95034
## Specificity            0.9888   0.9817   0.9998 1.000000  0.99948
## Pos Pred Value         0.9841   0.9777   0.9980 1.000000  0.97680
## Neg Pred Value         0.9992   0.9986   0.9832 1.000000  0.99885
## Prevalence             0.4094   0.4454   0.1156 0.006851  0.02265
## Detection Rate         0.4090   0.4447   0.1005 0.006851  0.02153
## Detection Prevalence   0.4156   0.4548   0.1007 0.006851  0.02204
## Balanced Accuracy      0.9939   0.9900   0.9346 1.000000  0.97491
conf_default
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2    3    4    5
##          1 8005    2    7    0    0
##          2    1 8710    0    0    4
##          3    2    0 2253    0    0
##          4    0    0    0  134    0
##          5    0    0    1    0  439
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9991          
##                  95% CI : (0.9986, 0.9995)
##     No Information Rate : 0.4454          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9986          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9996   0.9998   0.9965 1.000000  0.99097
## Specificity            0.9992   0.9995   0.9999 1.000000  0.99995
## Pos Pred Value         0.9989   0.9994   0.9991 1.000000  0.99773
## Neg Pred Value         0.9997   0.9998   0.9995 1.000000  0.99979
## Prevalence             0.4094   0.4454   0.1156 0.006851  0.02265
## Detection Rate         0.4093   0.4453   0.1152 0.006851  0.02245
## Detection Prevalence   0.4098   0.4456   0.1153 0.006851  0.02250
## Balanced Accuracy      0.9994   0.9997   0.9982 1.000000  0.99546
#Step 6) Visualize Result

#varImpPlot(fit_rf)

RANGER RANDOM FOREST

library(ranger)
## Warning: package 'ranger' was built under R version 3.6.3
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(tuneRanger)
## Warning: package 'tuneRanger' was built under R version 3.6.3
## Loading required package: mlrMBO
## Warning: package 'mlrMBO' was built under R version 3.6.3
## Loading required package: mlr
## Warning: package 'mlr' was built under R version 3.6.3
## Loading required package: ParamHelpers
## Warning: package 'ParamHelpers' was built under R version 3.6.3
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:e1071':
## 
##     impute
## The following object is masked from 'package:caret':
## 
##     train
## Loading required package: smoof
## Warning: package 'smoof' was built under R version 3.6.3
## Loading required package: checkmate
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.6.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:cowplot':
## 
##     stamp
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: lhs
## Warning: package 'lhs' was built under R version 3.6.3
# Fit a ranger random forest classifier for neighbourhood_group on the
# training split. Fixes two issues in the original: `classification = T`
# used the reassignable shorthand T (spelled out as TRUE), and the fitted
# model was named `ranger`, masking the package function of the same name.
ranger_clf <- ranger(neighbourhood_group ~ ., data = train,
                     write.forest = TRUE, classification = TRUE)

# Predict on the held-out test set.
ranger_clf_pred <- predict(ranger_clf, data = test)

# Cross-tabulate true vs predicted class labels.
ranger_clf_tab <- table(test$neighbourhood_group, predictions(ranger_clf_pred))

# Mosaic-style plot of the confusion table.
plot(ranger_clf_tab)

# Ranger regression forest for (scaled) price using the same predictors.
# The call is echoed in the printed result below, so it is kept verbatim.
# NOTE(review): OOB R-squared is only ~0.08 — these four predictors explain
# little of the price variance.
rangerReg <- ranger( price~ ., data = train, write.forest = TRUE, classification = F)
rangerReg
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = train, write.forest = TRUE, classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      29337 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.8986566 
## R squared (OOB):                  0.08119909
# Predict scaled price for the test set with the regression forest.
rangerReg_pred = predict(rangerReg, data = test)
rangerReg_pred
## Ranger prediction
## 
## Type:                             Regression 
## Sample size:                      19558 
## Number of independent variables:  4
# NOTE(review): table() on a continuous response yields a huge, sparse
# cross-tab; its printing is commented out below, presumably for that reason.
tab = table(test$price, predictions(rangerReg_pred))
#tab
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(x = test$price, y = predictions(rangerReg_pred))
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

RANGER TUNING

library(tuneRanger)

# https://github.com/PhilippPro/tuneRanger
# https://mlr.mlr-org.com/articles/tutorial/measures.html
# Wrap the training data in an mlr regression task (target = price),
# estimate how long tuning will take, then tune mtry, min.node.size and
# sample.fraction via model-based optimisation (20 iterations, MSE measure).
task = makeRegrTask(data = train, target = "price")
estimateTimeTuneRanger(task, iters = 20, num.threads = 8, num.trees = 1000)
## Approximated time for tuning: 4M 3S
res = tuneRanger(task, measure = list(mse), num.trees = 1000, 
                 num.threads = 8, iters = 20,  show.info = getOption("mlrMBO.show.info", TRUE))
## Computing y column(s) for design. Not provided.
## [mbo] 0: mtry=1; min.node.size=230; sample.fraction=0.777 : y = 0.885 : 3.0 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=933; sample.fraction=0.449 : y = 0.886 : 2.3 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=8; sample.fraction=0.749 : y = 0.979 : 27.0 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=19; sample.fraction=0.549 : y = 0.889 : 8.8 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=56; sample.fraction=0.654 : y = 0.911 : 17.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=1.1e+03; sample.fraction=0.233 : y = 0.885 : 2.4 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=129; sample.fraction=0.373 : y = 0.888 : 9.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=2.18e+03; sample.fraction=0.509 : y = 0.885 : 2.4 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=6; sample.fraction=0.809 : y = 1.11 : 36.1 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=2; sample.fraction=0.463 : y = 0.885 : 2.6 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=66; sample.fraction=0.223 : y = 0.878 : 5.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=338; sample.fraction=0.359 : y = 0.884 : 4.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=4; sample.fraction=0.334 : y = 0.91 : 11.6 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=11; sample.fraction=0.485 : y = 0.884 : 2.9 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=561; sample.fraction=0.7 : y = 0.902 : 7.9 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=5.02e+03; sample.fraction=0.598 : y = 0.894 : 2.3 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=282; sample.fraction=0.572 : y = 0.882 : 5.2 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=35; sample.fraction=0.717 : y = 0.891 : 8.6 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=32; sample.fraction=0.398 : y = 0.895 : 10.8 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=3.01e+03; sample.fraction=0.674 : y = 0.89 : 2.0 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=4; sample.fraction=0.635 : y = 0.971 : 26.8 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=16; sample.fraction=0.59 : y = 0.949 : 24.4 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=4e+03; sample.fraction=0.868 : y = 0.886 : 2.9 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=177; sample.fraction=0.839 : y = 0.885 : 3.1 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=6; sample.fraction=0.251 : y = 0.883 : 6.2 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=1.5e+03; sample.fraction=0.303 : y = 0.89 : 2.7 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=3; sample.fraction=0.799 : y = 0.885 : 3.3 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=82; sample.fraction=0.898 : y = 0.999 : 20.8 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=2; sample.fraction=0.275 : y = 0.916 : 14.1 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=759; sample.fraction=0.415 : y = 0.886 : 2.4 secs : initdesign
## [mbo] 1: mtry=1; min.node.size=3; sample.fraction=0.201 : y = 0.885 : 3.0 secs : infill_cb
## [mbo] 2: mtry=4; min.node.size=44; sample.fraction=0.212 : y = 0.883 : 8.5 secs : infill_cb
## [mbo] 3: mtry=4; min.node.size=496; sample.fraction=0.514 : y = 0.895 : 6.4 secs : infill_cb
## [mbo] 4: mtry=2; min.node.size=866; sample.fraction=0.719 : y = 0.883 : 4.2 secs : infill_cb
## [mbo] 5: mtry=2; min.node.size=99; sample.fraction=0.293 : y = 0.879 : 4.6 secs : infill_cb
## [mbo] 6: mtry=1; min.node.size=15; sample.fraction=0.703 : y = 0.885 : 3.2 secs : infill_cb
## [mbo] 7: mtry=3; min.node.size=4.66e+03; sample.fraction=0.215 : y = 0.907 : 1.6 secs : infill_cb
## [mbo] 8: mtry=2; min.node.size=29; sample.fraction=0.201 : y = 0.88 : 4.7 secs : infill_cb
## [mbo] 9: mtry=3; min.node.size=194; sample.fraction=0.203 : y = 0.882 : 3.9 secs : infill_cb
## [mbo] 10: mtry=1; min.node.size=2; sample.fraction=0.9 : y = 0.885 : 3.1 secs : infill_cb
## [mbo] 11: mtry=1; min.node.size=16; sample.fraction=0.275 : y = 0.885 : 2.5 secs : infill_cb
## [mbo] 12: mtry=1; min.node.size=24; sample.fraction=0.889 : y = 0.885 : 3.2 secs : infill_cb
## [mbo] 13: mtry=1; min.node.size=5.34e+03; sample.fraction=0.304 : y = 0.921 : 1.9 secs : infill_cb
## [mbo] 14: mtry=1; min.node.size=2.33e+03; sample.fraction=0.898 : y = 0.888 : 1.9 secs : infill_cb
## [mbo] 15: mtry=2; min.node.size=525; sample.fraction=0.385 : y = 0.882 : 3.7 secs : infill_cb
## [mbo] 16: mtry=1; min.node.size=98; sample.fraction=0.564 : y = 0.885 : 2.6 secs : infill_cb
## [mbo] 17: mtry=3; min.node.size=24; sample.fraction=0.242 : y = 0.885 : 8.4 secs : infill_cb
## [mbo] 18: mtry=2; min.node.size=69; sample.fraction=0.225 : y = 0.879 : 5.0 secs : infill_cb
## [mbo] 19: mtry=1; min.node.size=2; sample.fraction=0.33 : y = 0.885 : 2.7 secs : infill_cb
## [mbo] 20: mtry=3; min.node.size=97; sample.fraction=0.276 : y = 0.88 : 7.4 secs : infill_cb

Mean of best 5 % of the results

res
## Recommended parameter settings: 
##   mtry min.node.size sample.fraction
## 1    2            78       0.2467632
## Results: 
##         mse exec.time
## 1 0.8787459      4.84

From an earlier tuning run — Recommended parameter settings: mtry = 2, min.node.size = 55, sample.fraction = 0.2136541. Results: mse = 0.933998, exec.time = 2.756667.

Model with the new tuned hyperparameters

res$model
## Model for learner.id=regr.ranger; learner.class=regr.ranger
## Trained on: task.id = train; obs = 29337; features = 4
## Hyperparameters: num.threads=8,verbose=FALSE,respect.unordered.factors=order,mtry=2,min.node.size=78,sample.fraction=0.247,num.trees=1e+03,replace=FALSE
# Refit ranger with tuned hyperparameters.
# NOTE(review): min.node.size = 55 and sample.fraction = 0.214 come from an
# earlier tuning run, not from the `res` printed above (which recommends
# min.node.size = 78, sample.fraction ≈ 0.247) — confirm which settings are
# intended before relying on this model.
tuned_rangerReg <- ranger( price~ ., data = train, write.forest = TRUE, classification = F, mtry= 2, 
                           min.node.size = 55, sample.fraction = 0.214,num.trees = 1000, replace= FALSE)
tuned_rangerReg
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = train, write.forest = TRUE, classification = F,      mtry = 2, min.node.size = 55, sample.fraction = 0.214, num.trees = 1000,      replace = FALSE) 
## 
## Type:                             Regression 
## Number of trees:                  1000 
## Sample size:                      29337 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 55 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.8787042 
## R squared (OOB):                  0.1015987
# Predict scaled price on the test set with the tuned regression forest.
tuned_rangerReg_pred = predict(tuned_rangerReg, data = test)
tuned_rangerReg_pred
## Ranger prediction
## 
## Type:                             Regression 
## Sample size:                      19558 
## Number of independent variables:  4

ROC CURVE

# NOTE(review): `pred` is not defined in this section — presumably left over
# from an earlier chunk; verify it exists (and matches column 14 of `test`)
# before running.
cm = table(test[,14], pred)

library(caret)

confusionMatrix(as.factor(pred), as.factor(test$price))

# NOTE(review): levels = 1:19558 uses the test-set row count as factor
# levels; treating a continuous price as a 19558-level factor is unlikely to
# be meaningful — confirm the intent of this confusion matrix.
confusionMatrix( factor(pred, levels = 1:19558), factor(test$price, levels = 1:19558) )

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.3
## 
## Attaching package: 'ROCR'
## The following object is masked from 'package:mlr':
## 
##     performance
#ROCR
#roc_pred = predict(tuned_rangerReg,data = test, type="response")
#perf = prediction(roc_pred$predictions, test)

#==
#FIND OPTIMAL VALUE WITH MIN OUT OF BAG
#https://www.listendata.com/2014/11/random-forest-with-r.html

DECISION TREE — TO BE DONE

library(rpart)
library(caret)
library(e1071)

dt = train(neighbourhood_group ~ ., data = train, method = "rpart")
pred2 = predict(dt, data = test)
table(pred2, test)

NEURAL NETWORKS ??

https://medium.com/@brscntyz/neural-network-in-r-e275302b6e44 https://datascienceplus.com/fitting-neural-network-in-r/ https://www.kdnuggets.com/2016/08/begineers-guide-neural-networks-r.html/2

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'stringr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x purrr::accumulate()      masks foreach::accumulate()
## x lubridate::as.difftime() masks base::as.difftime()
## x randomForest::combine()  masks dplyr::combine()
## x lubridate::date()        masks base::date()
## x plotly::filter()         masks dplyr::filter(), stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x purrr::lift()            masks caret::lift()
## x randomForest::margin()   masks ggplot2::margin()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::stamp()       masks cowplot::stamp()
## x lubridate::union()       masks base::union()
## x purrr::when()            masks foreach::when()
library("keras")
## Warning: package 'keras' was built under R version 3.6.3
## 
## Attaching package: 'keras'
## The following object is masked _by_ '.GlobalEnv':
## 
##     normalize
## The following object is masked from 'package:future':
## 
##     %<-%
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.6.3
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
## 
##     prediction
## The following object is masked from 'package:dplyr':
## 
##     compute
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:future':
## 
##     cluster
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following object is masked from 'package:mlr':
## 
##     impute
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Build numeric design matrices (dummy-encoded factors) for the neural
# network, using the same formula for train and test so columns align.
m <- model.matrix( 
  ~price+neighbourhood_group+room_type+longitude+latitude,
  data = train 
)

m_test  <- model.matrix( 
  ~price+neighbourhood_group+room_type+longitude+latitude,
  data = test 
)

head(m)
##    (Intercept)       price neighbourhood_group2 neighbourhood_group3
## 1            1 -0.01549291                    0                    0
## 3            1 -0.01132892                    1                    0
## 5            1 -0.30280835                    1                    0
## 6            1  0.19687067                    1                    0
## 8            1 -0.30697234                    1                    0
## 10           1 -0.01132892                    1                    0
##    neighbourhood_group4 neighbourhood_group5 room_type2 room_type3  longitude
## 1                     0                    0          0          0 -0.4376476
## 3                     0                    0          0          0  0.2224944
## 5                     0                    0          1          0  0.1772139
## 6                     0                    0          1          0 -0.4946274
## 8                     0                    0          0          0 -0.7097639
## 10                    0                    0          1          0 -0.8276232
##      latitude
## 1  -1.4938339
## 3   1.4683845
## 5   1.2756468
## 6   0.3433173
## 8   0.6591063
## 10 -0.2844097
#nn=neuralnet(price~ neighbourhood_group2+ neighbourhood_group3 +neighbourhood_group4+ neighbourhood_group5+ room_type2+ room_type3+longitude+latitude,data=m, hidden=10,act.fct = "logistic",
#             linear.output = TRUE,stepmax=10^5,threshold = 0.01)

You can also change the hidden layers by specifying them as a numeric vector, like this:

#nn=neuralnet( price~ neighbourhood_group2+ neighbourhood_group3 +neighbourhood_group4+ neighbourhood_group5+ room_type2+ room_type3+longitude+latitude,data=m, hidden=c(7,6,5),act.fct = "logistic",
#           linear.output = TRUE,stepmax=10^5,threshold = 0.01)

#hidden=c(7,6,5)

Then come prediction and calculation of the error. I calculate the error with the root mean squared error (RMSE) method.

nn_pred = compute(nn, test[, 1:13])
nn_pred$net.result
RMSE <- function(actual, predicted) { return(sqrt(sum(actual^2 - predicted^2)/length(actual))) }
summary(nn_pred)
nn_pred <- is.numeric(nn_pred)
RMSE(test$price, nn_pred)

plot(test$price, nn_pred$net.result)

pr.nn_ <- nn_pred$net.result * (max(dataset$price) - min(dataset$price)) + min(dataset$price)

LINEAR REGRESSION

https://datascienceplus.com/fitting-neural-network-in-r/

# Baseline linear model: glm() with the default gaussian family is ordinary
# least-squares regression of (scaled) price on all predictors in `train`.
lm.fit <- glm(price~., data=train)
summary(lm.fit)
## 
## Call:
## glm(formula = price ~ ., data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.009  -0.254  -0.102   0.044  41.017  
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -0.361387   0.012348 -29.267  < 2e-16 ***
## neighbourhood_group2  0.271929   0.019299  14.091  < 2e-16 ***
## neighbourhood_group3  0.112169   0.023815   4.710 2.49e-06 ***
## neighbourhood_group4 -0.463934   0.068425  -6.780 1.22e-11 ***
## neighbourhood_group5  0.189675   0.046677   4.064 4.85e-05 ***
## latitude             -0.054288   0.009057  -5.994 2.07e-09 ***
## longitude            -0.080103   0.008788  -9.115  < 2e-16 ***
## room_type2            0.440940   0.011498  38.348  < 2e-16 ***
## room_type3           -0.091403   0.036869  -2.479   0.0132 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.893665)
## 
##     Null deviance: 28693  on 29336  degrees of freedom
## Residual deviance: 26209  on 29328  degrees of freedom
## AIC: 79968
## 
## Number of Fisher Scoring iterations: 2
# Predict on the test set and compute the test-set MSE of the linear model.
pr.lm <- predict(lm.fit,test)
MSE.lm <- sum((pr.lm - test$price)^2)/nrow(test)
# Two panels were intended: NN (left, commented out) and LM (right).
par(mfrow=c(1,2))
#plot(test$price,nn_pred$net.result,col='red',main='Real vs predicted NN',pch=18,cex=0.7)
#abline(0,1,lwd=2)
#legend('bottomright',legend='NN',pch=18,col='red', bty='n')
# Observed vs predicted; points on the 45-degree line are perfect fits.
plot(test$price,pr.lm,col='blue',main='Real vs predicted Linear Regression',pch=18, cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='LM',pch=18,col='blue', bty='n', cex=.95)